library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(magrittr)
library(XML)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(stringi)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
library(ggplot2)
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
##
## config
## The following object is masked from 'package:caret':
##
## progress
library(jsonlite)
library(e1071)
library(naniar)
library(scales)
library(gghighlight)
library(stringr)
library(knitr)
knitr::opts_chunk$set(error = TRUE)
beers=read.csv('/Users/renfengwang/Documents/SMU\ Data\ Science\ Program/Doing\ Data\ Science/Project\ 1/Beers.csv',header=T)
brewery=read.csv('/Users/renfengwang/Documents/SMU\ Data\ Science\ Program/Doing\ Data\ Science/Project\ 1/Breweries.csv',header=T)
head(beers)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## 6 Black Exodus 2261 0.077 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
## 6 Oatmeal Stout 12
head(brewery)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
#Question 1
brewery %>% arrange(State) %>% ggplot(aes(x=State),count=Name)+geom_bar()+geom_text(aes(label=..count..),stat='count',vjust=-.5) +
xlab('States')+ ylab('Brewery Numbers') + ggtitle('Numbers of Brewery by State') #Bar plot to count the brewery numbers in each state
#Question 2
colnames(beers)[colnames(beers)=='Brewery_id']='Brew_ID' #Change one of the data frame column names before merging two data sets.
df_beer=merge(beers, brewery, by='Brew_ID', all=T) #Outer join two data frames.
colnames(df_beer)[colnames(df_beer)=='Name.x']='Beer_Name'
colnames(df_beer)[colnames(df_beer)=='Name.y']='Brewery_Name'
head(df_beer)
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery_Name City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
tail(df_beer)
## Brew_ID Beer_Name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_Name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Question 3
df_beerall=df_beer %>% replace_with_na_all(condition = ~.x=='')
df_beerall=df_beerall %>% replace_with_na_all(condition = ~.x==' ')
gg_miss_var(df_beerall)
table(is.na(df_beerall$IBU))
##
## FALSE TRUE
## 1405 1005
table(is.na(df_beerall$ABV))
##
## FALSE TRUE
## 2348 62
We can see there are missing values in both ABV and IBU columns. In ABV column, there are 62 missing values. In IBU column, there are 1005 missing values.
#Question 4
Median_ABV_IBU=df_beerall %>% arrange(State) %>% group_by(State) %>% summarize(Median_ABV=median(ABV, na.rm=TRUE), Median_IBU=median(IBU,na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV_IBU %>% ggplot(aes(x=State, y=Median_ABV,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=percent(Median_ABV, accuracy = 0.01)),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Median Alcoholic Content')+ggtitle('Median Alcoholic Content by State')
Median_ABV_IBU %>% ggplot(aes(x=State, y=Median_IBU,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=Median_IBU),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Median International Bitterness')+ggtitle('Median International Bitterness by State') #Removed state=SD as NA
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
State South Dakota doesn’t have IBU values, but has ABV values.
#Question 5
Max_ABV_IBU=df_beerall %>% arrange(State) %>% group_by(State) %>% summarize(Max_ABV=max(ABV, na.rm=TRUE), Max_IBU=max(IBU,na.rm=T))
## Warning in max(IBU, na.rm = T): no non-missing arguments to max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV_IBU %>% ggplot(aes(x=State, y=Max_ABV,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=percent(Max_ABV, accuracy = 0.01)),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Max Alcoholic Content')+ggtitle('Max Alcoholic Content by State')
Max_ABV_IBU %>% ggplot(aes(x=State, y=Max_IBU,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=Max_IBU),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Max International Bitterness')+ggtitle('Max International Bitterness by State')
## Warning: Removed 1 rows containing missing values (geom_bar).
We can see Colorado has the maximum Alcoholic beer with ABV 12.8% and Oregon has the most bitter beer with IBU 138.
#Question 6
df_beerall%>% summarize(Mean=mean(ABV, na.rm=TRUE),
Median=median(ABV,na.rm=T),
Min=min(ABV,na.rm=T),
Max=max(ABV,na.rm=T),
SD=sd(ABV,na.rm=T),
N=n())
## # A tibble: 1 x 6
## Mean Median Min Max SD N
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0.0598 0.056 0.001 0.128 0.0135 2410
df_beer %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV))+geom_histogram(aes(y=..density..),colour='black',fill='white')+
geom_density(alpha=.5, fill='#FF6666')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of ABV is right skewed. ABV of beers around 5% has the most counts. There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%, the median ABV is 5.6%. The mean ABV is 5.98% and standard deviation of ABV is 1.35%.
#Question 7
df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth(method=loess)
## `geom_smooth()` using formula 'y ~ x'
Most beers with lower IBU (less than 50) have ABV values around 5%. When IBU value increases, ABV values spreads out. But most beers with IBU values above 50, their ABV values spread out within the region between 5% and 10%.
#Question 8 KNN
df_beer_IPA=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
filter(str_detect(Style, regex(str_c('\\b','IPA','\\b',sep=''), ignore_case = T)))
df_beer_IPA$Style=as.factor('IPA')
df_beer_Ale=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
filter(str_detect(Style, regex(str_c('\\b','Ale','\\b',sep=''), ignore_case = T)))
df_beer_Ale$Style=as.factor('Ale')
df_beer_test=rbind(df_beer_IPA, df_beer_Ale)
iterations = 500
numks = 30
splitPerc = .7
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
classifications=knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style, prob = TRUE, k = 5)
CM = confusionMatrix(table(classifications,beer_test$Style))
classifications
## [1] IPA Ale Ale Ale IPA Ale IPA IPA IPA Ale IPA IPA Ale IPA IPA IPA IPA IPA
## [19] Ale IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [37] IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [55] IPA Ale IPA IPA Ale IPA IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA Ale
## [73] IPA IPA IPA IPA IPA Ale IPA IPA Ale IPA Ale IPA IPA IPA IPA Ale Ale IPA
## [91] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale
## [109] IPA IPA IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [127] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [145] Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale IPA Ale Ale Ale
## [163] IPA Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA Ale Ale IPA Ale Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [199] IPA Ale IPA Ale IPA Ale IPA Ale IPA Ale Ale Ale Ale Ale Ale IPA IPA Ale
## [217] IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale
## [235] IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale Ale
## [253] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale
## [271] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale
## attr(,"prob")
## [1] 0.6000000 0.7500000 0.8000000 1.0000000 1.0000000 0.6666667 1.0000000
## [8] 1.0000000 0.6666667 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 0.8571429 1.0000000 1.0000000 0.5000000 0.6666667 1.0000000
## [22] 1.0000000 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [29] 0.9166667 1.0000000 0.8000000 0.8333333 0.8333333 1.0000000 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 0.9000000 0.7142857 1.0000000
## [43] 1.0000000 0.8000000 1.0000000 0.8571429 0.7142857 1.0000000 1.0000000
## [50] 1.0000000 0.8333333 0.8571429 0.8571429 0.8571429 1.0000000 1.0000000
## [57] 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000 0.8000000 1.0000000
## [64] 0.8000000 0.8571429 0.7777778 0.8333333 1.0000000 1.0000000 0.8888889
## [71] 0.8333333 0.8571429 0.7777778 1.0000000 0.8000000 0.8888889 0.6000000
## [78] 0.6666667 0.8571429 0.8000000 0.8333333 1.0000000 0.8000000 1.0000000
## [85] 0.6666667 0.8888889 1.0000000 0.8000000 0.6000000 0.6000000 1.0000000
## [92] 1.0000000 0.9000000 0.8333333 0.8333333 1.0000000 1.0000000 1.0000000
## [99] 0.9000000 0.8888889 0.8888889 1.0000000 0.6000000 0.8571429 0.7500000
## [106] 1.0000000 1.0000000 0.8888889 0.8750000 0.8571429 0.8333333 1.0000000
## [113] 0.7500000 0.7500000 0.5555556 1.0000000 1.0000000 0.8888889 1.0000000
## [120] 1.0000000 0.5000000 1.0000000 1.0000000 1.0000000 0.8571429 0.5000000
## [127] 0.8333333 0.6000000 1.0000000 0.8750000 1.0000000 1.0000000 0.8333333
## [134] 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000 0.7777778
## [141] 1.0000000 1.0000000 1.0000000 0.9000000 1.0000000 1.0000000 1.0000000
## [148] 1.0000000 0.7142857 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [155] 1.0000000 1.0000000 0.6666667 0.8571429 0.8571429 0.6000000 1.0000000
## [162] 0.8000000 0.5714286 0.8000000 0.8000000 0.6000000 0.7500000 0.8000000
## [169] 1.0000000 0.8000000 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000
## [176] 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8571429
## [183] 1.0000000 1.0000000 0.7500000 0.7500000 0.7500000 1.0000000 1.0000000
## [190] 1.0000000 0.6000000 0.5000000 0.8571429 0.8000000 1.0000000 0.8333333
## [197] 1.0000000 0.8000000 0.6000000 1.0000000 0.5714286 1.0000000 1.0000000
## [204] 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000
## [211] 1.0000000 1.0000000 0.8571429 0.8571429 0.6666667 1.0000000 0.7142857
## [218] 1.0000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [225] 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000 1.0000000
## [232] 1.0000000 1.0000000 0.8000000 1.0000000 0.8000000 1.0000000 1.0000000
## [239] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [246] 1.0000000 0.6000000 1.0000000 0.8333333 0.5555556 0.6666667 1.0000000
## [253] 1.0000000 0.5555556 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [260] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [267] 1.0000000 1.0000000 1.0000000 1.0000000 0.5555556 1.0000000 1.0000000
## [274] 1.0000000 1.0000000 1.0000000 1.0000000 0.7500000 1.0000000 1.0000000
## [281] 1.0000000 1.0000000 1.0000000 0.8333333 1.0000000
## Levels: IPA Ale
CM
## Confusion Matrix and Statistics
##
##
## classifications IPA Ale
## IPA 92 26
## Ale 23 144
##
## Accuracy : 0.8281
## 95% CI : (0.7792, 0.87)
## No Information Rate : 0.5965
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6443
##
## Mcnemar's Test P-Value : 0.7751
##
## Sensitivity : 0.8000
## Specificity : 0.8471
## Pos Pred Value : 0.7797
## Neg Pred Value : 0.8623
## Prevalence : 0.4035
## Detection Rate : 0.3228
## Detection Prevalence : 0.4140
## Balanced Accuracy : 0.8235
##
## 'Positive' Class : IPA
##
I randomly assigned 70% of original data to training data and 30% of the original data as testing data when seed value is set to 6. We can tell when k=5, the accuracy is about 82.81%.
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style, prob = TRUE, k = i)
table(classifications,beer_test$Style)
CM = confusionMatrix(table(classifications,beer_test$Style))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
p=ggplot(mapping=aes(x=seq(1,numks,1), y=MeanAcc))+geom_line()
ggplotly(p)
I shuffled the training and testing data 500 times by 70%/30% split. And assigned integer k value from 1 to 30. From the plot, we can tell when k=5, it gives us the highest accuracy 85% which means we can predict the beer is either India Pale Ales or any other types of Ale by knowing its ABV and IBU values with 85% accuracy when sets nearest neighbor numbers equals to 5
#Naive Bayes--------------
iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7
set.seed(6)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
model = naiveBayes(beer_train[,c(4,5)],as.factor(beer_train$Style),laplace = 1)
table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style))
CM = confusionMatrix(table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style)))
masterAcc[j] = CM$overall[1]
masterSen[j] = CM$byClass[1]
masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.8432491
MeanSen
## [1] 0.808263
MeanSpec
## [1] 0.8679429
Now I chose to use Naive Bayes model as I want to compare the accuracy with KNN model. I shuffled the training and testing data 500 times by 70%/30% split like I did in KNN model. The mean accuracy of Naive Bayes is 84% which is similar to what we got from KNN.
#Question 9 Sorted Lager, Stout, IPA
df_beer_stout=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
filter(str_detect(Style, regex(str_c('\\b','stout','\\b',sep=''), ignore_case = T)))
df_beer_lager=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
filter(str_detect(Style, regex(str_c('\\b','lager','\\b',sep=''), ignore_case = T)))
df_beer_stout$Style=as.factor('Stout')
df_beer_lager$Style=as.factor('Lager')
df_beer_sort=rbind(df_beer_IPA, df_beer_stout)
df_beer_sort=rbind(df_beer_sort, df_beer_lager)
iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7
set.seed(6)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(df_beer_sort)[1],round(splitPerc * dim(df_beer_sort)[1]))
beer_sort_train = df_beer_sort[trainIndices,]
beer_sort_test = df_beer_sort[-trainIndices,]
model = naiveBayes(beer_sort_train[,c(4,5)],as.factor(beer_sort_train$Style),laplace = 1)
table(predict(model,beer_sort_test[,c(4,5)]),as.factor(beer_sort_test$Style))
CM = confusionMatrix(table(predict(model,beer_sort_test[,c(4,5)]),as.factor(beer_sort_test$Style)))
masterAcc[j] = CM$overall[1]
masterSen[j] = CM$byClass[1]
masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.8469811
MeanSen
## [1] 0.9374376
MeanSpec
## [1] 0.01695888
Now I want to use IBU and ABV values to predict non-IPA Ales, lager and stout these three beer styles. I shuffled the training and testing data 500 times by 70%/30% split like I did before. First I used Naive Bayes method. The mean accuracy is about 84.5%.
iterations = 500
numks = 30
splitPerc = .7
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(df_beer_sort)[1],round(splitPerc * dim(df_beer_sort)[1]))
beer_sort_train = df_beer_sort[trainIndices,]
beer_sort_test = df_beer_sort[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(beer_sort_train[,c(4,5)],beer_sort_test[,c(4,5)],beer_sort_train$Style, prob = TRUE, k = i)
table(classifications,beer_sort_test$Style)
CM = confusionMatrix(table(classifications,beer_sort_test$Style))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
e=ggplot(mapping=aes(x=seq(1,numks,1), y=MeanAcc))+geom_line()
ggplotly(e)
MeanAcc
## [1] 0.8788553 0.8594088 0.8674843 0.8640252 0.8640503 0.8643648 0.8668805
## [8] 0.8694969 0.8716478 0.8732704 0.8740881 0.8745031 0.8752075 0.8752075
## [15] 0.8756855 0.8758365 0.8760377 0.8760881 0.8765660 0.8767925 0.8770314
## [22] 0.8770943 0.8771321 0.8769937 0.8768050 0.8768302 0.8768050 0.8766541
## [29] 0.8768428 0.8769057
Now I used KNN method. The highest mean accuracy is 87.9% when k=1.We can see from the plot that accuracy dropped when k=2, but increased when k>=6. Then the accuracy keeps arising.
set.seed(4)
trainIndices = sample(1:dim(df_beer_sort)[1],round(splitPerc * dim(df_beer_sort)[1]))
beer_sort_train = df_beer_sort[trainIndices,]
beer_sort_test = df_beer_sort[-trainIndices,]
classifications_knn=knn(beer_sort_train[,c(4,5)],beer_sort_test[,c(4,5)],beer_sort_train$Style, prob = TRUE, k = 5)
CM_knn = confusionMatrix(table(classifications,beer_sort_test$Style))
classifications_knn
## [1] IPA IPA IPA IPA IPA IPA IPA IPA Stout IPA IPA IPA
## [13] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [25] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [37] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [49] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [61] IPA Lager Stout IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [73] IPA IPA IPA IPA IPA IPA Stout IPA IPA IPA Lager IPA
## [85] Stout IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [97] IPA IPA IPA IPA IPA IPA IPA IPA Lager IPA IPA IPA
## [109] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [121] IPA IPA IPA IPA IPA Stout IPA Lager Stout IPA Lager IPA
## [133] IPA Lager IPA Lager Lager Lager Lager Lager Lager Lager Lager Lager
## [145] Lager Lager Lager Lager Lager IPA Lager Lager Lager Lager Lager Lager
## [157] Lager Lager Lager
## attr(,"prob")
## [1] 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333 0.6000000 0.8000000
## [8] 1.0000000 0.6000000 0.7142857 1.0000000 1.0000000 0.8888889 0.8888889
## [15] 0.8888889 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 0.6666667
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 0.8888889 1.0000000 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 1.0000000 0.7500000 1.0000000 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8750000
## [43] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000
## [57] 1.0000000 1.0000000 1.0000000 0.6250000 1.0000000 0.5000000 0.5714286
## [64] 1.0000000 0.5555556 1.0000000 0.8888889 0.8750000 1.0000000 1.0000000
## [71] 0.8333333 0.8000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [78] 0.8333333 0.4000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
## [85] 0.5000000 0.8333333 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [92] 0.5000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [99] 1.0000000 1.0000000 1.0000000 0.8333333 0.8000000 1.0000000 0.6000000
## [106] 1.0000000 0.8333333 0.8000000 0.8000000 0.6666667 0.6666667 0.8888889
## [113] 1.0000000 1.0000000 1.0000000 1.0000000 0.4000000 1.0000000 1.0000000
## [120] 1.0000000 1.0000000 0.8333333 0.8000000 0.8571429 0.7142857 0.5000000
## [127] 0.7500000 0.7142857 0.4000000 0.8888889 0.6666667 0.5000000 0.4000000
## [134] 1.0000000 0.6000000 0.6666667 1.0000000 0.6666667 0.6666667 1.0000000
## [141] 0.5714286 0.6000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000
## [148] 1.0000000 1.0000000 0.6250000 0.8000000 1.0000000 0.6666667 1.0000000
## [155] 1.0000000 1.0000000 0.6666667 1.0000000 0.8000000
## Levels: IPA Stout Lager
CM_knn
## Confusion Matrix and Statistics
##
##
## classifications IPA Stout Lager
## IPA 120 8 5
## Stout 0 0 0
## Lager 4 4 18
##
## Overall Statistics
##
## Accuracy : 0.8679
## 95% CI : (0.8052, 0.9163)
## No Information Rate : 0.7799
## P-Value [Acc > NIR] : 0.003347
##
## Kappa : 0.5924
##
## Mcnemar's Test P-Value : 0.007012
##
## Statistics by Class:
##
## Class: IPA Class: Stout Class: Lager
## Sensitivity 0.9677 0.00000 0.7826
## Specificity 0.6286 1.00000 0.9412
## Pos Pred Value 0.9023 NaN 0.6923
## Neg Pred Value 0.8462 0.92453 0.9624
## Prevalence 0.7799 0.07547 0.1447
## Detection Rate 0.7547 0.00000 0.1132
## Detection Prevalence 0.8365 0.00000 0.1635
## Balanced Accuracy 0.7982 0.50000 0.8619
Here is one of the case I shuffled the testing and training data once and the accuracy we got is 85% when k=5
#Back to Question 7
df_beer_study=rbind(df_beer_sort, df_beer_Ale)
df_beer_study %>% ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth() +facet_wrap(~Style)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Now we came back to Question 7 by checking four different styles of beer. We can see, majority of lager style beer has low IBU and its ABV values are around 5%. No ABV values of lager beer are above 7.5%. ABV values of Indian Pale Ale style seems like increases when IBU increases, but their ABV values don’t pass 10% and majority of their ABV values are between 5% to 10%. The other types of Ale beer have low IBU values which most of them are below 50. Also, most of their ABV values are between 3.75% to 10%. The data size of stout style beer is small, but we can roughly tell their ABV increases when IBU increases.